# Load the breast-cancer measurements from the author's local CSV export and
# preview the first rows (one row per sample, one column per measurement).
data <- read.csv(
  file = 'C:/Users/patel/Desktop/Main folder/Brest cancer/data.csv'
)
head(x = data)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 17.99 10.38 122.80 1001.0
## 2 842517 M 20.57 17.77 132.90 1326.0
## 3 84300903 M 19.69 21.25 130.00 1203.0
## 4 84348301 M 11.42 20.38 77.58 386.1
## 5 84358402 M 20.29 14.34 135.10 1297.0
## 6 843786 M 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.03003 0.006193 25.38 17.33
## 2 0.01389 0.003532 24.99 23.41
## 3 0.02250 0.004571 23.57 25.53
## 4 0.05963 0.009208 14.91 26.50
## 5 0.01756 0.005115 22.54 16.67
## 6 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.60 2019.0 0.1622 0.6656
## 2 158.80 1956.0 0.1238 0.1866
## 3 152.50 1709.0 0.1444 0.4245
## 4 98.87 567.7 0.2098 0.8663
## 5 152.20 1575.0 0.1374 0.2050
## 6 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 1 0.7119 0.2654 0.4601
## 2 0.2416 0.1860 0.2750
## 3 0.4504 0.2430 0.3613
## 4 0.6869 0.2575 0.6638
## 5 0.4000 0.1625 0.2364
## 6 0.5355 0.1741 0.3985
## fractal_dimension_worst X
## 1 0.11890 NA
## 2 0.08902 NA
## 3 0.08758 NA
## 4 0.17300 NA
## 5 0.07678 NA
## 6 0.12440 NA
# Drop the all-NA trailing column `X` produced by the CSV import.
data[["X"]] <- NULL
# Drop the first column (the sample `id`), which carries no predictive signal.
data <- data[, -1]
# Recode the diagnosis: 'B' becomes "Benign", every other value "Malignant".
data$diagnosis <- as.factor(
  ifelse(data$diagnosis == 'B', "Benign", "Malignant")
)
summary(object = data)
## diagnosis radius_mean texture_mean perimeter_mean
## Benign :357 Min. : 6.981 Min. : 9.71 Min. : 43.79
## Malignant:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509
## Median :0.006380 Median :0.020450 Median :0.02589
## Mean :0.007041 Mean :0.025478 Mean :0.03189
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205
## Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave.points_se symmetry_se fractal_dimension_se
## Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave.points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst
## Min. :0.1565 Min. :0.05504
## 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2822 Median :0.08004
## Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.6638 Max. :0.20750
# Print the structure of the cleaned data frame (column types and leading values).
str(data)
## 'data.frame': 569 obs. of 31 variables:
## $ diagnosis : Factor w/ 2 levels "Benign","Malignant": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
# Preview the first rows after cleaning (id and X removed, diagnosis recoded).
head(data)
## diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 Malignant 17.99 10.38 122.80 1001.0
## 2 Malignant 20.57 17.77 132.90 1326.0
## 3 Malignant 19.69 21.25 130.00 1203.0
## 4 Malignant 11.42 20.38 77.58 386.1
## 5 Malignant 20.29 14.34 135.10 1297.0
## 6 Malignant 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.11840 0.27760 0.3001 0.14710
## 2 0.08474 0.07864 0.0869 0.07017
## 3 0.10960 0.15990 0.1974 0.12790
## 4 0.14250 0.28390 0.2414 0.10520
## 5 0.10030 0.13280 0.1980 0.10430
## 6 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.2419 0.07871 1.0950 0.9053 8.589
## 2 0.1812 0.05667 0.5435 0.7339 3.398
## 3 0.2069 0.05999 0.7456 0.7869 4.585
## 4 0.2597 0.09744 0.4956 1.1560 3.445
## 5 0.1809 0.05883 0.7572 0.7813 5.438
## 6 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.40 0.006399 0.04904 0.05373 0.01587
## 2 74.08 0.005225 0.01308 0.01860 0.01340
## 3 94.03 0.006150 0.04006 0.03832 0.02058
## 4 27.23 0.009110 0.07458 0.05661 0.01867
## 5 94.44 0.011490 0.02461 0.05688 0.01885
## 6 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.03003 0.006193 25.38 17.33
## 2 0.01389 0.003532 24.99 23.41
## 3 0.02250 0.004571 23.57 25.53
## 4 0.05963 0.009208 14.91 26.50
## 5 0.01756 0.005115 22.54 16.67
## 6 0.02165 0.005082 15.47 23.75
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.60 2019.0 0.1622 0.6656
## 2 158.80 1956.0 0.1238 0.1866
## 3 152.50 1709.0 0.1444 0.4245
## 4 98.87 567.7 0.2098 0.8663
## 5 152.20 1575.0 0.1374 0.2050
## 6 103.40 741.6 0.1791 0.5249
## concavity_worst concave.points_worst symmetry_worst
## 1 0.7119 0.2654 0.4601
## 2 0.2416 0.1860 0.2750
## 3 0.4504 0.2430 0.3613
## 4 0.6869 0.2575 0.6638
## 5 0.4000 0.1625 0.2364
## 6 0.5355 0.1741 0.3985
## fractal_dimension_worst
## 1 0.11890
## 2 0.08902
## 3 0.08758
## 4 0.17300
## 5 0.07678
## 6 0.12440
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 3.4.4
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Pairwise correlation charts for the three feature groups:
# columns 2-11 (mean), 12-21 (se) and 22-31 (worst).
chart.Correlation(
  data[, 2:11],
  histogram = TRUE, col = "blue", main = "Cancer Mean"
)
chart.Correlation(
  data[, 12:21],
  method = "pearson", hist.col = "#1fbbfa", main = "Cancer SE"
)
chart.Correlation(
  data[, 22:31],
  method = "pearson", hist.col = "#1fbbfa", main = "Cancer worst"
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
# Scatterplot matrix of the mean features (columns 2-11), coloured by
# diagnosis (column 1), with smoothed fits in the lower panels.
ggpairs(
  data[, c(2:11, 1)],
  aes(color = diagnosis, alpha = 0.75),
  lower = list(continuous = "smooth")
) +
  theme_bw() +
  labs(title = "Cancer Mean") +
  theme(
    plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot matrix of the standard-error features (columns 12-21),
# coloured by diagnosis (column 1), with smoothed fits in the lower panels.
ggpairs(
  data[, c(12:21, 1)],
  aes(color = diagnosis, alpha = 0.75),
  lower = list(continuous = "smooth")
) +
  theme_bw() +
  labs(title = "Cancer SE") +
  theme(
    plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatterplot matrix of the "worst" features (columns 22-31),
# coloured by diagnosis (column 1), with smoothed fits in the lower panels.
ggpairs(
  data[, c(22:31, 1)],
  aes(color = diagnosis, alpha = 0.75),
  lower = list(continuous = "smooth")
) +
  theme_bw() +
  labs(title = "Cancer Worst") +
  theme(
    plot.title = element_text(face = 'bold', color = 'black', hjust = 0.5, size = 12)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.4.4
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Working copy of the cleaned data used by every PCA below.
data_pca <- data
# PCA on all 30 numeric features (column 1 is the diagnosis factor).
# Variables are centred and scaled to unit variance before the decomposition.
# `cor` is a princomp() argument, not a prcomp() one, and was being discarded
# with a warning; `scale.` is now spelled out instead of partially matched.
all_pca <- prcomp(data_pca[, -1], center = TRUE, scale. = TRUE)
summary(all_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# PCA restricted to the ten "mean" features (columns 2-11), centred and
# scaled to unit variance. The unsupported `cor` argument (a princomp()
# option that prcomp() discards with a warning) is removed and `scale.` is
# spelled out in full.
mean_pca <- prcomp(data_pca[, 2:11], center = TRUE, scale. = TRUE)
summary(mean_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3406 1.5870 0.93841 0.7064 0.61036 0.35234
## Proportion of Variance 0.5479 0.2519 0.08806 0.0499 0.03725 0.01241
## Cumulative Proportion 0.5479 0.7997 0.88779 0.9377 0.97495 0.98736
## PC7 PC8 PC9 PC10
## Standard deviation 0.28299 0.18679 0.10552 0.01680
## Proportion of Variance 0.00801 0.00349 0.00111 0.00003
## Cumulative Proportion 0.99537 0.99886 0.99997 1.00000
# PCA restricted to the ten standard-error features (columns 12-21), centred
# and scaled to unit variance. The unsupported `cor` argument (a princomp()
# option that prcomp() discards with a warning) is removed and `scale.` is
# spelled out in full.
se_pca <- prcomp(data_pca[, 12:21], center = TRUE, scale. = TRUE)
summary(se_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.1779 1.4406 1.1245 0.77095 0.75991 0.57939
## Proportion of Variance 0.4743 0.2075 0.1264 0.05944 0.05775 0.03357
## Cumulative Proportion 0.4743 0.6819 0.8083 0.86774 0.92548 0.95905
## PC7 PC8 PC9 PC10
## Standard deviation 0.43512 0.3962 0.20436 0.14635
## Proportion of Variance 0.01893 0.0157 0.00418 0.00214
## Cumulative Proportion 0.97798 0.9937 0.99786 1.00000
# PCA restricted to the ten "worst" features (columns 22-31), centred and
# scaled to unit variance. The unsupported `cor` argument (a princomp()
# option that prcomp() discards with a warning) is removed and `scale.` is
# spelled out in full.
worst_pca <- prcomp(data_pca[, 22:31], center = TRUE, scale. = TRUE)
summary(worst_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.3869 1.4443 0.89597 0.73531 0.71741 0.42862
## Proportion of Variance 0.5697 0.2086 0.08028 0.05407 0.05147 0.01837
## Cumulative Proportion 0.5697 0.7783 0.85860 0.91267 0.96413 0.98251
## PC7 PC8 PC9 PC10
## Standard deviation 0.28959 0.26802 0.12343 0.06326
## Proportion of Variance 0.00839 0.00718 0.00152 0.00040
## Cumulative Proportion 0.99089 0.99808 0.99960 1.00000
# Scree plots: percentage of variance carried by the first 10 components of
# each PCA. Each plot is titled after the feature group it shows (previously
# all four carried the "All" title) and the axis label reads "Principal".
fviz_eig(all_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = 'Red', barcolor = "grey", linecolor = 'black', ncp = 10) +
  labs(title = 'Cancer All variance -PCA', x = 'Principal Components', y = "% of variance")
fviz_eig(mean_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = 'Pink', barcolor = "grey", linecolor = 'black', ncp = 10) +
  labs(title = 'Cancer Mean variance -PCA', x = 'Principal Components', y = "% of variance")
fviz_eig(se_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = 'Pink', barcolor = "grey", linecolor = 'black', ncp = 10) +
  labs(title = 'Cancer SE variance -PCA', x = 'Principal Components', y = "% of variance")
fviz_eig(worst_pca, addlabels = TRUE, ylim = c(0, 60), geom = c("bar", "line"),
         barfill = 'Pink', barcolor = "grey", linecolor = 'black', ncp = 10) +
  labs(title = 'Cancer Worst variance -PCA', x = 'Principal Components', y = "% of variance")
# Extract the variable-level PCA results (coord, cor, cos2, contrib) for the
# all-features PCA and print the summary of what is available.
all_var <- get_pca_var(all_pca)
all_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
library("corrplot")
## corrplot 0.84 loaded
# All-features PCA: plot the cos2 and contribution matrices of the variables.
corrplot(all_var$cos2, is.corr = FALSE)
corrplot(all_var$contrib, is.corr = FALSE)
library(gridExtra)
# Top-10 variable contributions to the first and second components, side by side.
p1 <- fviz_contrib(
  all_pca, choice = "var", axes = 1, fill = "pink", color = "grey", top = 10
)
p2 <- fviz_contrib(
  all_pca, choice = "var", axes = 2, fill = "skyblue", color = "grey", top = 10
)
grid.arrange(p1, p2, ncol = 2)
# Variable-level results for the mean-features PCA.
mean_var <- get_pca_var(mean_pca)
mean_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# Mean-features PCA: cos2 of each variable on each component.
corrplot(mean_var$cos2, is.corr = FALSE)
# Contributions of variables to PCA: highlight the most contributing
# variables for each component.
corrplot(mean_var$contrib, is.corr = FALSE)
# Top-10 variable contributions to the first and second components, side by side.
p1 <- fviz_contrib(mean_pca, choice = "var", axes = 1, fill = "pink", color = "grey", top = 10)
p2 <- fviz_contrib(mean_pca, choice = "var", axes = 2, fill = "skyblue", color = "grey", top = 10)
grid.arrange(p1, p2, ncol = 2)
# Variable-level results for the SE-features PCA.
se_var <- get_pca_var(se_pca)
se_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# SE-features PCA: plot the cos2 and contribution matrices of the variables.
corrplot(se_var$cos2, is.corr = FALSE)
corrplot(se_var$contrib, is.corr = FALSE)
# Top-10 variable contributions to the first and second components, side by side.
p1 <- fviz_contrib(
  se_pca, choice = "var", axes = 1, fill = "pink", color = "grey", top = 10
)
p2 <- fviz_contrib(
  se_pca, choice = "var", axes = 2, fill = "skyblue", color = "grey", top = 10
)
grid.arrange(p1, p2, ncol = 2)
# Variable-level results for the worst-features PCA.
worst_var <- get_pca_var(worst_pca)
worst_var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# Worst-features PCA: plot the cos2 and contribution matrices of the variables.
corrplot(worst_var$cos2, is.corr = FALSE)
corrplot(worst_var$contrib, is.corr = FALSE)
# Top-10 variable contributions to the first and second components, side by side.
p1 <- fviz_contrib(
  worst_pca, choice = "var", axes = 1, fill = "pink", color = "grey", top = 10
)
p2 <- fviz_contrib(
  worst_pca, choice = "var", axes = 2, fill = "skyblue", color = "grey", top = 10
)
grid.arrange(p1, p2, ncol = 2)
# Group the variables of each PCA by k-means on their component coordinates,
# then colour the variable plot by cluster. The seed is set once, so the
# four k-means runs must stay in this order to reproduce the same clusters.
set.seed(100)

# All features: 6 clusters.
res.all <- kmeans(all_var$coord, centers = 6, nstart = 25)
grp <- as.factor(res.all$cluster)
fviz_pca_var(all_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

# Mean features: 3 clusters.
res.mean <- kmeans(mean_var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.mean$cluster)
fviz_pca_var(mean_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

# SE features: 4 clusters.
res.se <- kmeans(se_var$coord, centers = 4, nstart = 25)
grp <- as.factor(res.se$cluster)
fviz_pca_var(se_pca, col.var = grp, palette = "jco", legend.title = "Cluster")

# Worst features: 3 clusters.
res.worst <- kmeans(worst_var$coord, centers = 3, nstart = 25)
grp <- as.factor(res.worst$cluster)
fviz_pca_var(worst_pca, col.var = grp, palette = "jco", legend.title = "Cluster")
# Biplots for each PCA: samples drawn as points coloured by diagnosis with
# ellipses per group, overlaid with the variables.
fviz_pca_biplot(
  all_pca,
  col.ind = data$diagnosis, col = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
fviz_pca_biplot(
  mean_pca,
  col.ind = data$diagnosis, col = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
fviz_pca_biplot(
  se_pca,
  col.ind = data$diagnosis, col = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
fviz_pca_biplot(
  worst_pca,
  col.ind = data$diagnosis, col = "black",
  palette = "jco", geom = "point", repel = TRUE,
  legend.title = "Diagnosis", addEllipses = TRUE
)
# Random 70/30 train/test split, reproducible through the fixed seed.
nrows <- NROW(data)
set.seed(100)
# floor() makes the sample size an explicit integer instead of relying on
# sample() silently truncating 0.7 * nrows; seq_len() is safe for nrows == 0.
index <- sample(seq_len(nrows), floor(0.7 * nrows))
train <- data[index, ]   ## 70% of the rows: training data
test <- data[-index, ]   ## remaining 30%: test data
# Check the proportion of diagnosis (Benign / Malignant)
prop.table(table(train$diagnosis))
prop.table(table(test$diagnosis))
library(rpart)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
# Classification tree on all predictors; minsplit = 2 lets a node with as
# few as two observations be considered for a split.
learn_rp <- rpart(
  diagnosis ~ .,
  data = train,
  control = rpart.control(minsplit = 2)
)
# Predict class labels for the held-out rows (column 1, the label, removed).
pre_ro <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(data = pre_ro, reference = test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 98 7
## Malignant 7 59
##
## Accuracy : 0.9181
## 95% CI : (0.8664, 0.9545)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8273
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9333
## Specificity : 0.8939
## Pos Pred Value : 0.9333
## Neg Pred Value : 0.8939
## Prevalence : 0.6140
## Detection Rate : 0.5731
## Detection Prevalence : 0.6140
## Balanced Accuracy : 0.9136
##
## 'Positive' Class : Benign
##
library(rpart)
library(caret)
# Refit of the same classification tree as above (same formula, data and
# control settings), followed by the same test-set evaluation.
tree_control <- rpart.control(minsplit = 2)
learn_rp <- rpart(diagnosis ~ ., data = train, control = tree_control)
pre_ro <- predict(learn_rp, newdata = test[, -1], type = "class")
cm_rp <- confusionMatrix(data = pre_ro, reference = test$diagnosis)
cm_rp
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 98 7
## Malignant 7 59
##
## Accuracy : 0.9181
## 95% CI : (0.8664, 0.9545)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8273
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9333
## Specificity : 0.8939
## Pos Pred Value : 0.9333
## Neg Pred Value : 0.8939
## Prevalence : 0.6140
## Detection Rate : 0.5731
## Detection Prevalence : 0.6140
## Balanced Accuracy : 0.9136
##
## 'Positive' Class : Benign
##
# Prune the tree back to the complexity parameter (CP) of the cptable row
# with the smallest cross-validated error (xerror), then evaluate on test.
learn_pru <- prune(
  learn_rp,
  cp = learn_rp$cptable[which.min(learn_rp$cptable[, "xerror"]), "CP"]
)
pre_pru <- predict(learn_pru, newdata = test[, -1], type = "class")
cm_pru <- confusionMatrix(data = pre_pru, reference = test$diagnosis)
cm_pru
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 91 6
## Malignant 14 60
##
## Accuracy : 0.883
## 95% CI : (0.8252, 0.9271)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : 4.171e-15
##
## Kappa : 0.7587
## Mcnemar's Test P-Value : 0.1175
##
## Sensitivity : 0.8667
## Specificity : 0.9091
## Pos Pred Value : 0.9381
## Neg Pred Value : 0.8108
## Prevalence : 0.6140
## Detection Rate : 0.5322
## Detection Prevalence : 0.5673
## Balanced Accuracy : 0.8879
##
## 'Positive' Class : Benign
##
library("RWeka")
## Warning: package 'RWeka' was built under R version 3.4.4
# OneR rule learner (RWeka) trained on all predictors, evaluated on the
# held-out rows with the label column removed.
learn_1r <- OneR(diagnosis ~ ., data = train)
pre_1r <- predict(learn_1r, test[, -1])
cm_1r <- confusionMatrix(data = pre_1r, reference = test$diagnosis)
cm_1r
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 96 8
## Malignant 9 58
##
## Accuracy : 0.9006
## 95% CI : (0.8456, 0.941)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7908
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9143
## Specificity : 0.8788
## Pos Pred Value : 0.9231
## Neg Pred Value : 0.8657
## Prevalence : 0.6140
## Detection Rate : 0.5614
## Detection Prevalence : 0.6082
## Balanced Accuracy : 0.8965
##
## 'Positive' Class : Benign
##
# JRip rule learner (RWeka) trained on all predictors, evaluated on the
# held-out rows with the label column removed.
learn_jrip <- JRip(diagnosis ~ ., data = train)
pre_jrip <- predict(learn_jrip, test[, -1])
cm_jrip <- confusionMatrix(data = pre_jrip, reference = test$diagnosis)
cm_jrip
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 96 3
## Malignant 9 63
##
## Accuracy : 0.9298
## 95% CI : (0.8806, 0.9632)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8544
## Mcnemar's Test P-Value : 0.1489
##
## Sensitivity : 0.9143
## Specificity : 0.9545
## Pos Pred Value : 0.9697
## Neg Pred Value : 0.8750
## Prevalence : 0.6140
## Detection Rate : 0.5614
## Detection Prevalence : 0.5789
## Balanced Accuracy : 0.9344
##
## 'Positive' Class : Benign
##
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
# Naive Bayes classifier: predictors are every column but the first,
# the class label is the diagnosis factor.
learn_nb <- naiveBayes(x = train[, -1], y = train$diagnosis)
pre_nb <- predict(learn_nb, newdata = test[, -1])
cm_nb <- confusionMatrix(data = pre_nb, reference = test$diagnosis)
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 100 6
## Malignant 5 60
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8639
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9524
## Specificity : 0.9091
## Pos Pred Value : 0.9434
## Neg Pred Value : 0.9231
## Prevalence : 0.6140
## Detection Rate : 0.5848
## Detection Prevalence : 0.6199
## Balanced Accuracy : 0.9307
##
## 'Positive' Class : Benign
##
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest with 500 trees; proximity and variable-importance measures
# are computed and stored on the fitted object.
learn_rf <- randomForest(
  diagnosis ~ .,
  data = train,
  ntree = 500,
  proximity = TRUE,
  importance = TRUE
)
pre_rf <- predict(learn_rf, newdata = test[, -1])
cm_rf <- confusionMatrix(data = pre_rf, reference = test$diagnosis)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 100 4
## Malignant 5 62
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8893
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9524
## Specificity : 0.9394
## Pos Pred Value : 0.9615
## Neg Pred Value : 0.9254
## Prevalence : 0.6140
## Detection Rate : 0.5848
## Detection Prevalence : 0.6082
## Balanced Accuracy : 0.9459
##
## 'Positive' Class : Benign
##
library(party)
## Warning: package 'party' was built under R version 3.4.4
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.4
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.4
# Conditional inference tree (party), limited to a depth of two levels.
learn_ct <- ctree(
  diagnosis ~ .,
  data = train,
  controls = ctree_control(maxdepth = 2)
)
pre_ct <- predict(learn_ct, test[, -1])
cm_ct <- confusionMatrix(data = pre_ct, reference = test$diagnosis)
cm_ct
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 99 5
## Malignant 6 61
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8647
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9429
## Specificity : 0.9242
## Pos Pred Value : 0.9519
## Neg Pred Value : 0.9104
## Prevalence : 0.6140
## Detection Rate : 0.5789
## Detection Prevalence : 0.6082
## Balanced Accuracy : 0.9335
##
## 'Positive' Class : Benign
##
library(class)
# Scan k = 1..30 for k-nearest-neighbours and record the test-set accuracy
# obtained with each k.
# NOTE(review): k is selected on the same test set that is later used to
# report performance, so the reported accuracy is optimistic; consider
# tuning on a validation split or with cross-validation.
k_values <- seq_len(30)
acc_test <- numeric(length(k_values))  # preallocated instead of grown with c()
for (i in k_values) {
  # Named `knn_pred` so the loop no longer masks the `predict()` generic.
  knn_pred <- knn(train = train[, -1], test = test[, -1], cl = train[, 1],
                  k = i, prob = TRUE)
  acc_test[i] <- mean(knn_pred == test[, 1])
}
acc <- data.frame(k = k_values, cnt = acc_test)
# First row reaching the maximum accuracy, i.e. the smallest best k.
opt_k <- subset(acc, cnt == max(cnt))[1, ]
sub <- paste("Optimal number of k is", opt_k$k, "(accuracy :", opt_k$cnt, ") in KNN")
library(highcharter)
## Warning: package 'highcharter' was built under R version 3.4.4
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
# Line chart of test accuracy against k; the subtitle reports the best k.
hchart(acc, 'line', hcaes(k, cnt)) %>%
  hc_title(text = "Accuracy With Varying K (KNN)") %>%
  hc_subtitle(text = sub) %>%
  hc_add_theme(hc_theme_google()) %>%
  hc_xAxis(title = list(text = "Number of Neighbors(k)")) %>%
  hc_yAxis(title = list(text = "Accuracy"))
# Final KNN prediction using the k selected above, evaluated on the test set.
pre_knn <- knn(
  train = train[, -1],
  test = test[, -1],
  cl = train[, 1],
  k = opt_k$k,
  prob = TRUE
)
cm_knn <- confusionMatrix(data = pre_knn, reference = test$diagnosis)
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 99 5
## Malignant 6 61
##
## Accuracy : 0.9357
## 95% CI : (0.8878, 0.9675)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8647
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9429
## Specificity : 0.9242
## Pos Pred Value : 0.9519
## Neg Pred Value : 0.9104
## Prevalence : 0.6140
## Detection Rate : 0.5789
## Detection Prevalence : 0.6082
## Balanced Accuracy : 0.9335
##
## 'Positive' Class : Benign
##
library(gbm)
## Warning: package 'gbm' was built under R version 3.4.4
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
# Pilot gbm fit used only to choose the number of boosting iterations via
# 3-fold cross-validation.
# NOTE(review): `diagnosis` is a two-level factor but this pilot model is
# fitted with distribution = "gaussian", and its interaction.depth /
# n.minobsinnode differ from the caret fit below; confirm this is intended.
test_gbm <- gbm(
  diagnosis ~ .,
  data = train,
  distribution = "gaussian",
  n.trees = 10000,
  shrinkage = 0.01,
  interaction.depth = 4,
  bag.fraction = 0.5,
  train.fraction = 0.5,
  n.minobsinnode = 10,
  cv.folds = 3,
  keep.data = TRUE,
  verbose = FALSE,
  n.cores = 1
)
best.iter <- gbm.perf(test_gbm, method = "cv", plot.it = FALSE)

# Final boosted model trained through caret with 5-fold CV on a single
# parameter combination that uses the iteration count chosen above.
fitControl <- trainControl(method = "cv", number = 5, returnResamp = "all")
learn_gbm <- train(
  diagnosis ~ .,
  data = train,
  method = "gbm",
  distribution = "bernoulli",
  trControl = fitControl,
  verbose = FALSE,
  tuneGrid = data.frame(
    .n.trees = best.iter,
    .shrinkage = 0.01,
    .interaction.depth = 1,
    .n.minobsinnode = 1
  )
)
pre_gbm <- predict(learn_gbm, test[, -1])
cm_gbm <- confusionMatrix(data = pre_gbm, reference = test$diagnosis)
cm_gbm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 99 3
## Malignant 6 63
##
## Accuracy : 0.9474
## 95% CI : (0.9024, 0.9757)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8899
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.9429
## Specificity : 0.9545
## Pos Pred Value : 0.9706
## Neg Pred Value : 0.9130
## Prevalence : 0.6140
## Detection Rate : 0.5789
## Detection Prevalence : 0.5965
## Balanced Accuracy : 0.9487
##
## 'Positive' Class : Benign
##
library(rpart)
library(ada)
## Warning: package 'ada' was built under R version 3.4.4
# Base-learner settings for boosting: cp = -1, trees up to depth 14, one
# competitor split retained, no internal cross-validation.
control <- rpart.control(cp = -1, maxdepth = 14, maxcompete = 1, xval = 0)
# Gentle AdaBoost with 70 iterations. The training rows are also passed as
# the ada() "test" set (test.x / test.y).
learn_ada <- ada(
  diagnosis ~ .,
  data = train,
  test.x = train[, -1],
  test.y = train[, 1],
  type = "gentle",
  control = control,
  iter = 70
)
pre_ada <- predict(learn_ada, test[, -1])
cm_ada <- confusionMatrix(data = pre_ada, reference = test$diagnosis)
cm_ada
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 101 4
## Malignant 4 62
##
## Accuracy : 0.9532
## 95% CI : (0.9099, 0.9796)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9013
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9619
## Specificity : 0.9394
## Pos Pred Value : 0.9619
## Neg Pred Value : 0.9394
## Prevalence : 0.6140
## Detection Rate : 0.5906
## Detection Prevalence : 0.6140
## Balanced Accuracy : 0.9506
##
## 'Positive' Class : Benign
##
# Baseline SVM: svm() called with only the formula and data, so every other
# setting is left at the function's defaults. Predictions are made on `test`
# with its first column dropped and compared against test$diagnosis.
learn_svm <- svm(
  diagnosis ~ .,
  data = train
)
pre_svm <- predict(learn_svm, newdata = test[, -1])
cm_svm <- confusionMatrix(data = pre_svm, reference = test$diagnosis)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 101 2
## Malignant 4 64
##
## Accuracy : 0.9649
## 95% CI : (0.9252, 0.987)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9264
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.9619
## Specificity : 0.9697
## Pos Pred Value : 0.9806
## Neg Pred Value : 0.9412
## Prevalence : 0.6140
## Detection Rate : 0.5906
## Detection Prevalence : 0.6023
## Balanced Accuracy : 0.9658
##
## 'Positive' Class : Benign
##
# Grid search over the SVM `gamma` and `cost` hyper-parameters.
# For every row of `parms` an SVM is fitted on `train`, scored on `test`, and
# its accuracy stored in accuracy2[i]; `acc` pairs each grid-row index (p)
# with that accuracy (cnt) and `opt_p` is the first row with the best accuracy.
# NOTE(review): the grid is scored on `test`, the same data later used to
# report the tuned model's accuracy, so that figure is optimistically biased;
# consider selecting on a validation split or via cross-validation instead.
gamma <- seq(0, 0.1, 0.005)
cost <- 2^(0:5)
parms <- expand.grid(cost = cost, gamma = gamma) # 6 costs x 21 gammas = 126 rows
acc_test <- numeric()
n_parms <- nrow(parms)
accuracy1 <- NULL
accuracy2 <- numeric(n_parms) # preallocated instead of grown inside the loop
for (i in seq_len(n_parms)) {
  # Loop-local names: the original reused learn_svm / pre_svm here, which
  # silently replaced the baseline SVM model and its predictions.
  svm_fit_i <- svm(
    diagnosis ~ .,
    data = train,
    gamma = parms$gamma[i],
    cost = parms$cost[i]
  )
  svm_pred_i <- predict(svm_fit_i, test[, -1])
  accuracy1 <- confusionMatrix(svm_pred_i, test$diagnosis)
  accuracy2[i] <- accuracy1$overall[1]
}
acc <- data.frame(p = seq_len(n_parms), cnt = accuracy2)
# which.max() returns the first maximum, matching the original
# subset(acc, cnt == max(cnt))[1, ] without a floating-point == comparison.
opt_p <- acc[which.max(acc$cnt), ]
# opt_p$p is the row index into `parms`, not a count of parameters.
sub <- paste("Optimal number of parameter is", opt_p$p, "(accuracy :", opt_p$cnt,") in SVM")
library(highcharter)
# Line chart of accuracy (cnt) against grid-row index (p) from `acc`, built
# step by step instead of through a pipe chain; the subtitle text comes from
# `sub`. The final bare name prints the widget, as the piped expression did.
svm_acc_chart <- hchart(acc, "line", hcaes(x = p, y = cnt))
svm_acc_chart <- hc_title(
  svm_acc_chart,
  text = "Accuracy With Varying Parameters (SVM)"
)
svm_acc_chart <- hc_subtitle(svm_acc_chart, text = sub)
svm_acc_chart <- hc_add_theme(svm_acc_chart, hc_theme_google())
svm_acc_chart <- hc_xAxis(
  svm_acc_chart,
  title = list(text = "Number of Parameters")
)
svm_acc_chart <- hc_yAxis(svm_acc_chart, title = list(text = "Accuracy"))
svm_acc_chart
# Tuned SVM: refit on `train` with the cost / gamma taken from the row of
# `parms` indexed by opt_p$p, then predict on `test` (first column dropped)
# and tabulate the predictions against test$diagnosis.
learn_imp_svm <- svm(
  diagnosis ~ .,
  data = train,
  cost = parms$cost[opt_p$p],
  gamma = parms$gamma[opt_p$p]
)
pre_imp_svm <- predict(learn_imp_svm, newdata = test[, -1])
cm_imp_svm <- confusionMatrix(
  data = pre_imp_svm,
  reference = test$diagnosis
)
cm_imp_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Benign Malignant
## Benign 104 2
## Malignant 1 64
##
## Accuracy : 0.9825
## 95% CI : (0.9496, 0.9964)
## No Information Rate : 0.614
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9629
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9905
## Specificity : 0.9697
## Pos Pred Value : 0.9811
## Neg Pred Value : 0.9846
## Prevalence : 0.6140
## Detection Rate : 0.6082
## Detection Prevalence : 0.6199
## Balanced Accuracy : 0.9801
##
## 'Positive' Class : Benign
##
# Draw one fourfold plot per model's confusion matrix on a 3 x 4 grid.
# Each panel title is "<label> (<accuracy>%)", with accuracy rounded to a
# whole percent. The list order fixes the panel order (filled row by row).
col <- c("#ed3b3b", "#0099ff")
cm_by_model <- list(
  "RPart" = cm_rp,
  "Prune" = cm_pru,
  "OneR" = cm_1r,
  "JRip" = cm_jrip,
  "CTree" = cm_ct,
  "NaiveBayes" = cm_nb,
  "Tune KNN" = cm_knn,
  "RandomForest" = cm_rf,
  "GBM" = cm_gbm,
  "AdaBoost" = cm_ada,
  "SVM" = cm_svm,
  "Tune SVM" = cm_imp_svm
)
# par() returns the previous settings; keep them so the 3 x 4 layout does not
# leak into plots drawn after this section.
old_par <- par(mfrow = c(3, 4))
for (model_name in names(cm_by_model)) {
  model_cm <- cm_by_model[[model_name]]
  fourfoldplot(
    model_cm$table,
    color = col,
    conf.level = 0,
    margin = 1,
    main = paste0(model_name, " (", round(model_cm$overall[1] * 100), "%)")
  )
}
par(old_par)